In [83]:
%matplotlib inline
In [140]:
from datetime import datetime
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
In [85]:
# Use the pyplot interface directly: the pylab namespace is discouraged by
# matplotlib because it mixes pyplot and numpy names into one module.
# Only plt.plot() is needed in this notebook, which pyplot provides.
import matplotlib.pyplot as plt
In [86]:
# plot a random walk: the cumulative sum of 1000 standard-normal steps
# NOTE(review): no random seed is set, so the figure differs on every run
plt.plot(np.random.randn(1000).cumsum())
Out[86]:
In [87]:
# a plain NumPy array holding the integers 0..4
np.arange(5)
Out[87]:
In [88]:
# select a single element by position (the third one)
np.arange(5)[2]
Out[88]:
In [89]:
# slice by position: everything from the third element onwards
np.arange(5)[2:]
Out[89]:
In [90]:
## use pandas to add an index to this array
In [91]:
# one single-character label per element of the 5-element array
index = list('abcde')
In [92]:
# wrap the array 0..4 in a Series labelled with the strings in `index`
s = Series(np.arange(5), index=index)
s
Out[92]:
In [93]:
# the labels of the Series
s.index
Out[93]:
In [94]:
# the underlying data of the Series
s.values
Out[94]:
In [95]:
# Series objects behave like numpy arrays
# (arithmetic with a scalar is applied element-wise)
s * 2
Out[95]:
In [96]:
# arithmetic between two Series is element-wise as well
s * s
Out[96]:
In [97]:
# integer slice: the first three entries
s[:3]
Out[97]:
In [98]:
# look up a single entry by its label
s['b']
Out[98]:
In [99]:
# slice by label: from 'b' to the end
s['b':]
Out[99]:
In [100]:
# select entries by position, in the requested order.
# `s` has string labels, so plain s[[1, 0, 2]] only worked through the
# positional fallback of Series.__getitem__, which is deprecated in recent
# pandas; .iloc states the positional intent explicitly.
s.iloc[[1, 0, 2]]
Out[100]:
In [101]:
# select several entries by label, using a list of labels
s[['b', 'd', 'e']]
Out[101]:
In [102]:
# daily date index running from 16 July 2012 to 21 July 2012 (both inclusive)
dates = pd.date_range(start='2012-07-16', end='2012-07-21')
In [103]:
# display the date index
dates
Out[103]:
In [104]:
# time series of temperatures (in F) in Austin
# (six readings, one per entry of `dates`)
atemps = Series([101.5, 98, 95, 99, 100, 92], index=dates)
In [105]:
# the index of the series is the date index passed in above
atemps.index
Out[105]:
In [106]:
# take the third entry of the index ...
idx = atemps.index[2]
idx
Out[106]:
In [107]:
# ... and use it as a key to look up the corresponding temperature
atemps[idx]
Out[107]:
In [108]:
# another time series of temperatures (in F) in San Diego
# (shares the same `dates` index as the Austin series)
sdtemps = Series([77, 73, 77, 78, 75, 79], index=dates)
In [109]:
# combine both series into one DataFrame; the dict keys become column names
temps = DataFrame({'austin': atemps, 'san diego': sdtemps})
In [110]:
temps
Out[110]:
In [111]:
# row labels of the DataFrame
temps.index
Out[111]:
In [112]:
# column labels of the DataFrame
temps.columns
Out[112]:
In [113]:
# select a single column by name
temps['san diego']
Out[113]:
In [114]:
# add a derived column: Austin temperature minus San Diego temperature
# NOTE: this mutates `temps` in place; the column is removed again below
temps['difference'] = temps['austin'] - temps['san diego']
In [115]:
temps
Out[115]:
In [116]:
# drop the derived column again (in place)
del temps['difference']
In [117]:
temps
Out[117]:
In [118]:
# attribute-style column access; only possible because 'austin' is a valid
# Python identifier (the 'san diego' column needs temps['san diego'])
temps.austin
Out[118]:
In [119]:
# access the third row of the dataframe by position
# (.iloc is purely positional; the mixed-mode .ix indexer used previously
# was removed in pandas 1.0)
temps.iloc[2]
Out[119]:
In [120]:
# or use the index label to get the same row
# (.loc is purely label-based; it replaces the .ix indexer removed in pandas 1.0)
ridx = temps.index[2]
temps.loc[ridx]
Out[120]:
In [121]:
# access the 'austin' column of the third row
# (label-based row/column lookup via .loc; .ix was removed in pandas 1.0)
temps.loc[ridx, 'austin']
Out[121]:
In [122]:
# access the 'austin' column of all rows (beginning w/ the third)
# (label slice via .loc, starting at and including `ridx`; .ix was removed
# in pandas 1.0)
temps.loc[ridx:, 'austin']
Out[122]:
In [123]:
# access the 'austin' and 'san diego' columns of all rows (beginning w/ the third)
# (label slice for the rows, list of labels for the columns; .loc replaces
# the .ix indexer removed in pandas 1.0)
temps.loc[ridx:, ['austin', 'san diego']]
Out[123]:
In [124]:
# average temperature per city (one value for each column)
temps.mean(axis=0)
Out[124]:
In [125]:
# average across the cities for each day (one value for each row)
temps.mean(axis=1)
Out[125]:
In [126]:
# how far each reading lies from its own column's average
temps - temps.mean(axis=0)
Out[126]:
In [127]:
# keep only the days on which Austin was hotter than 100 F (boolean row mask)
temps.loc[temps['austin'] > 100]
Out[127]:
In [129]:
# starts at 53m
# NOTE(review): "53m" presumably marks a position in an accompanying
# recording -- confirm and spell out the source
# Timestamp is a subclass of datetime.datetime which supports nanoseconds
# (the method resolution order lists the base classes)
pd.Timestamp.mro()
Out[129]:
In [136]:
# a Timestamp can be built from a bare year string
pd.Timestamp('2016')
Out[136]:
In [151]:
# ... or from a date string that includes a time of day
pd.Timestamp('7/2/2012 10:06')
Out[151]:
In [154]:
# month-first vs. day-first date parsing
# print() with parentheses works on Python 3 (the bare print statement is a
# SyntaxError there) and still behaves the same on Python 2 for one argument.
# default: the ambiguous string is read month-first -> 2 July 2012
print(pd.to_datetime('7/2/2012'))
# dayfirst=True: the same string is read day-first -> 7 February 2012
print(pd.to_datetime('7/2/2012', dayfirst=True))
In [137]:
# build a Timestamp from a dotted day.month.year string with a trailing
# 4-digit token
# NOTE(review): presumably this parses as 17 July 2012, 10:06 -- confirm
# against the cell output
stamp = pd.Timestamp('17.7.2012 1006')
stamp
Out[137]:
In [138]:
# format the timestamp as a compact YYYYMMDD string
stamp.strftime("%Y%m%d")
Out[138]:
In [139]:
stamp.value # integer timestamp in nanoseconds
Out[139]:
In [141]:
# three ISO-formatted date strings, two days apart
# NOTE: this rebinds `dates`, which earlier in the notebook held a
# DatetimeIndex, to a plain list of strings
dates = ['2012-07-16', '2012-07-18', '2012-07-20']
In [144]:
# using the strings directly as an index does NOT convert them to dates
date_series = Series(range(3), index=dates)
date_series
Out[144]:
In [145]:
date_series.index[0] # the index is still a string!
Out[145]:
In [146]:
# convert the list of date strings into a DatetimeIndex
pd.DatetimeIndex(dates)
Out[146]:
In [149]:
# create a series with a proper datetime index
# (the date strings are converted to a DatetimeIndex first)
date_series = Series(range(3), index=pd.DatetimeIndex(dates))
# print() with parentheses works on Python 3 (the bare print statement is a
# SyntaxError there) and still behaves the same on Python 2 for one argument
print(date_series)
# first entry of the index, which is now built from a DatetimeIndex
print(date_series.index[0])
In [157]:
# instead of an end date, ask for a fixed number of periods from the start
# (date_range defaults to daily frequency)
pd.date_range('2012-07-12', periods=5)
Out[157]:
In [159]:
# 1000 consecutive calendar days, beginning on 12 July 2012
rng = pd.date_range(start='2012-07-12', periods=1000)
rng
Out[159]:
In [160]:
# time series holding the integers 0..999, one per day of `rng`
ts = Series(np.arange(1000), index=rng)
ts
Out[160]:
In [161]:
# look up an entry with a datetime.datetime object ...
ts[datetime(2013, 1, 29)]
Out[161]:
In [163]:
# ... with a date string ...
ts['2013-01-29']
Out[163]:
In [162]:
# ... or with an element of the index itself
# (position 201 of the index is 29 January 2013, so all three lookups
# address the same entry)
ts[ts.index[201]]
Out[162]:
In [165]:
# attention: slicing using datestrings includes the end element!
# this will only work on ordered time series!
# (here: everything from the start up to and including 20 July 2012)
ts[:'2012-07-20']
Out[165]:
In [ ]: